import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn import metrics
from lime import lime_tabular
import warnings
warnings.filterwarnings("ignore")
import random
# Load the California housing dataset and summarize the numeric columns.
df = pd.read_csv("housing.csv")
# Quick sanity check of ranges and missing values: total_bedrooms has only
# 20433 non-null entries out of 20640 rows, so some cleaning is needed.
df.describe()
| | longitude | latitude | housing_median_age | total_rooms | total_bedrooms | population | households | median_income | median_house_value |
|---|---|---|---|---|---|---|---|---|---|
| count | 20640.000000 | 20640.000000 | 20640.000000 | 20640.000000 | 20433.000000 | 20640.000000 | 20640.000000 | 20640.000000 | 20640.000000 |
| mean | -119.569704 | 35.631861 | 28.639486 | 2635.763081 | 537.870553 | 1425.476744 | 499.539680 | 3.870671 | 206855.816909 |
| std | 2.003532 | 2.135952 | 12.585558 | 2181.615252 | 421.385070 | 1132.462122 | 382.329753 | 1.899822 | 115395.615874 |
| min | -124.350000 | 32.540000 | 1.000000 | 2.000000 | 1.000000 | 3.000000 | 1.000000 | 0.499900 | 14999.000000 |
| 25% | -121.800000 | 33.930000 | 18.000000 | 1447.750000 | 296.000000 | 787.000000 | 280.000000 | 2.563400 | 119600.000000 |
| 50% | -118.490000 | 34.260000 | 29.000000 | 2127.000000 | 435.000000 | 1166.000000 | 409.000000 | 3.534800 | 179700.000000 |
| 75% | -118.010000 | 37.710000 | 37.000000 | 3148.000000 | 647.000000 | 1725.000000 | 605.000000 | 4.743250 | 264725.000000 |
| max | -114.310000 | 41.950000 | 52.000000 | 39320.000000 | 6445.000000 | 35682.000000 | 6082.000000 | 15.000100 | 500001.000000 |
# Discard the 207 rows with a missing total_bedrooms value, then inspect
# the levels of the only categorical column.
df = df.dropna(axis="index", how="any")
df.ocean_proximity.unique()
array(['NEAR BAY', '<1H OCEAN', 'INLAND', 'NEAR OCEAN', 'ISLAND'],
dtype=object)
# Integer-encode the ocean_proximity categories in place, then separate
# the feature matrix from the regression target.
# NOTE(review): LabelEncoder imposes an arbitrary ordinal order on a
# nominal category; tree models tolerate this, but one-hot encoding would
# be the more neutral choice — confirm intent.
le = LabelEncoder()
df["ocean_proximity"] = le.fit_transform(df.ocean_proximity)
X = df.drop(columns="median_house_value")
y = df.loc[:, "median_house_value"]
# Hold out 20% of the rows for evaluation, fit a small random forest,
# and report the held-out R^2.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=0
)
regr = RandomForestRegressor(n_estimators=32, random_state=0)
y_pred = regr.fit(X_train, y_train).predict(X_test)
metrics.r2_score(y_true=y_test, y_pred=y_pred)
0.8110934679631914
# Spot-check a single held-out row; the slice keeps the input 2-D.
regr.predict(X_test.iloc[32:33])
array([204815.625])
# Build a LIME explainer over the training distribution.
# random_state pins LIME's internal numpy RandomState so repeated calls to
# explain_instance are reproducible. The `random.seed(16)` calls sprinkled
# through the original script have NO effect on LIME — it draws its
# perturbation samples from numpy, not from the stdlib `random` module —
# which is the likely cause of the instability noted in the conclusion.
explainer = lime_tabular.LimeTabularExplainer(
    X_train.values,
    feature_names=X_train.columns.values.tolist(),
    mode="regression",
    random_state=16,
)
# Explain five held-out predictions. LIME samples its perturbations from
# numpy's RandomState, so `random.seed(16)` (stdlib) never stabilized these
# explanations — seed numpy's global RNG before each call instead, making
# every explanation reproducible run-to-run.
for row in (32, 64, 128, 256, 512):
    np.random.seed(16)
    explainer.explain_instance(X_test.values[row], regr.predict).show_in_notebook()
The LIME decompositions indicate that pricing depends mainly on location, described by three variables — longitude, latitude and ocean_proximity. The variable median_income is listed as the most influential twice, but its impact is often far smaller. So even though the LIME explainer works, judging by the median_income variable its explanations may not be sufficiently stable.